/*
 * Cppcheck - A tool for static C/C++ code analysis
 * Copyright (C) 2007-2024 Cppcheck team.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifdef HAVE_RULES

#include "regex.h"

#include <utility>

#ifdef _WIN32
#define PCRE_STATIC
#endif
#include <pcre.h>

namespace {
    /**
     * Translate an error code returned by pcre_exec() into a human readable text.
     *
     * The descriptions are taken from the PCRE API documentation; each one ends
     * with (or consists of) the symbolic name of the error code.
     *
     * @param pcreExecRet the value returned by pcre_exec()
     * @return the description of the code, or "unknown PCRE error <code>" for
     *         values that have no dedicated case below
     */
    std::string pcreErrorCodeToString(const int pcreExecRet)
    {
        switch (pcreExecRet) {
        case PCRE_ERROR_NULL:
            return "Either code or subject was passed as NULL, or ovector was NULL "
                   "and ovecsize was not zero (PCRE_ERROR_NULL)";
        case PCRE_ERROR_BADOPTION:
            return "An unrecognized bit was set in the options argument (PCRE_ERROR_BADOPTION)";
        case PCRE_ERROR_BADMAGIC:
            return "PCRE stores a 4-byte \"magic number\" at the start of the compiled code, "
                   "to catch the case when it is passed a junk pointer and to detect when a "
                   "pattern that was compiled in an environment of one endianness is run in "
                   "an environment with the other endianness. This is the error that PCRE "
                   "gives when the magic number is not present (PCRE_ERROR_BADMAGIC)";
        case PCRE_ERROR_UNKNOWN_NODE:
            return "While running the pattern match, an unknown item was encountered in the "
                   "compiled pattern. This error could be caused by a bug in PCRE or by "
                   "overwriting of the compiled pattern (PCRE_ERROR_UNKNOWN_NODE)";
        case PCRE_ERROR_NOMEMORY:
            return "If a pattern contains back references, but the ovector that is passed "
                   "to pcre_exec() is not big enough to remember the referenced substrings, "
                   "PCRE gets a block of memory at the start of matching to use for this purpose. "
                   "If the call via pcre_malloc() fails, this error is given. The memory is "
                   "automatically freed at the end of matching. This error is also given if "
                   "pcre_stack_malloc() fails in pcre_exec(). "
                   "This can happen only when PCRE has been compiled with "
                   "--disable-stack-for-recursion (PCRE_ERROR_NOMEMORY)";
        case PCRE_ERROR_NOSUBSTRING:
            return "This error is used by the pcre_copy_substring(), pcre_get_substring(), "
                   "and pcre_get_substring_list() functions (see below). "
                   "It is never returned by pcre_exec() (PCRE_ERROR_NOSUBSTRING)";
        case PCRE_ERROR_MATCHLIMIT:
            return "The backtracking limit, as specified by the match_limit field in a pcre_extra "
                   "structure (or defaulted) was reached. "
                   "See the description above (PCRE_ERROR_MATCHLIMIT)";
        case PCRE_ERROR_CALLOUT:
            return "This error is never generated by pcre_exec() itself. "
                   "It is provided for use by callout functions that want to yield a distinctive "
                   "error code. See the pcrecallout documentation for details (PCRE_ERROR_CALLOUT)";
        case PCRE_ERROR_BADUTF8:
            return "A string that contains an invalid UTF-8 byte sequence was passed as a subject, "
                   "and the PCRE_NO_UTF8_CHECK option was not set. If the size of the output vector "
                   "(ovecsize) is at least 2, the byte offset to the start of the invalid UTF-8 "
                   "character is placed in the first element, and a reason code is placed in the "
                   "second element. The reason codes are listed in the following section. For "
                   "backward compatibility, if PCRE_PARTIAL_HARD is set and the problem is a truncated "
                   "UTF-8 character at the end of the subject (reason codes 1 to 5), "
                   "PCRE_ERROR_SHORTUTF8 is returned instead of PCRE_ERROR_BADUTF8";
        case PCRE_ERROR_BADUTF8_OFFSET:
            return "The UTF-8 byte sequence that was passed as a subject was checked and found to "
                   "be valid (the PCRE_NO_UTF8_CHECK option was not set), but the value of "
                   "startoffset did not point to the beginning of a UTF-8 character or the end of "
                   "the subject (PCRE_ERROR_BADUTF8_OFFSET)";
        case PCRE_ERROR_PARTIAL:
            return "The subject string did not match, but it did match partially. See the "
                   "pcrepartial documentation for details of partial matching (PCRE_ERROR_PARTIAL)";
        case PCRE_ERROR_BADPARTIAL:
            return "This code is no longer in use. It was formerly returned when the PCRE_PARTIAL "
                   "option was used with a compiled pattern containing items that were not supported "
                   "for partial matching. From release 8.00 onwards, there are no restrictions on "
                   "partial matching (PCRE_ERROR_BADPARTIAL)";
        case PCRE_ERROR_INTERNAL:
            return "An unexpected internal error has occurred. This error could be caused by a bug "
                   "in PCRE or by overwriting of the compiled pattern (PCRE_ERROR_INTERNAL)";
        case PCRE_ERROR_BADCOUNT:
            return "This error is given if the value of the ovecsize argument is negative "
                   "(PCRE_ERROR_BADCOUNT)";
        case PCRE_ERROR_RECURSIONLIMIT:
            return "The internal recursion limit, as specified by the match_limit_recursion "
                   "field in a pcre_extra structure (or defaulted) was reached. "
                   "See the description above (PCRE_ERROR_RECURSIONLIMIT)";
        case PCRE_ERROR_DFA_UITEM:
            return "PCRE_ERROR_DFA_UITEM";
        case PCRE_ERROR_DFA_UCOND:
            return "PCRE_ERROR_DFA_UCOND";
        case PCRE_ERROR_DFA_WSSIZE:
            return "PCRE_ERROR_DFA_WSSIZE";
        case PCRE_ERROR_DFA_RECURSE:
            return "PCRE_ERROR_DFA_RECURSE";
        case PCRE_ERROR_NULLWSLIMIT:
            return "PCRE_ERROR_NULLWSLIMIT";
        case PCRE_ERROR_BADNEWLINE:
            return "An invalid combination of PCRE_NEWLINE_xxx options was "
                   "given (PCRE_ERROR_BADNEWLINE)";
        case PCRE_ERROR_BADOFFSET:
            return "The value of startoffset was negative or greater than the length "
                   "of the subject, that is, the value in length (PCRE_ERROR_BADOFFSET)";
        case PCRE_ERROR_SHORTUTF8:
            return "This error is returned instead of PCRE_ERROR_BADUTF8 when the subject "
                   "string ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD option is set. "
                   "Information about the failure is returned as for PCRE_ERROR_BADUTF8. "
                   "It is in fact sufficient to detect this case, but this special error code for "
                   "PCRE_PARTIAL_HARD precedes the implementation of returned information; "
                   "it is retained for backwards compatibility (PCRE_ERROR_SHORTUTF8)";
        case PCRE_ERROR_RECURSELOOP:
            return "This error is returned when pcre_exec() detects a recursion loop "
                   "within the pattern. Specifically, it means that either the whole pattern "
                   "or a subpattern has been called recursively for the second time at the same "
                   "position in the subject string. Some simple patterns that might do this "
                   "are detected and faulted at compile time, but more complicated cases, "
                   "in particular mutual recursions between two different subpatterns, "
                   "cannot be detected until run time (PCRE_ERROR_RECURSELOOP)";
        case PCRE_ERROR_JIT_STACKLIMIT:
            return "This error is returned when a pattern that was successfully studied "
                   "using a JIT compile option is being matched, but the memory available "
                   "for the just-in-time processing stack is not large enough. See the pcrejit "
                   "documentation for more details (PCRE_ERROR_JIT_STACKLIMIT)";
        case PCRE_ERROR_BADMODE:
            return "This error is given if a pattern that was compiled by the 8-bit library "
                   "is passed to a 16-bit or 32-bit library function, or vice versa (PCRE_ERROR_BADMODE)";
        case PCRE_ERROR_BADENDIANNESS:
            return "This error is given if a pattern that was compiled and saved is reloaded on a "
                   "host with different endianness. The utility function pcre_pattern_to_host_byte_order() "
                   "can be used to convert such a pattern so that it runs on the new host (PCRE_ERROR_BADENDIANNESS)";
        case PCRE_ERROR_DFA_BADRESTART:
            return "PCRE_ERROR_DFA_BADRESTART";
        // The two codes below are only handled for PCRE 8.32 and later. Compare the version as a
        // (major, minor) pair so that a larger major version is accepted regardless of its minor.
#if PCRE_MAJOR > 8 || (PCRE_MAJOR == 8 && PCRE_MINOR >= 32)
        case PCRE_ERROR_BADLENGTH:
            return "This error is given if pcre_exec() is called with a negative value for the length argument (PCRE_ERROR_BADLENGTH)";
        case PCRE_ERROR_JIT_BADOPTION:
            return "This error is returned when a pattern that was successfully studied using a JIT compile "
                   "option is being matched, but the matching mode (partial or complete match) does not correspond "
                   "to any JIT compilation mode. When the JIT fast path function is used, this error may be "
                   "also given for invalid options. See the pcrejit documentation for more details (PCRE_ERROR_JIT_BADOPTION)";
#endif
        }
        // No dedicated description: report the raw numeric code
        return "unknown PCRE error " + std::to_string(pcreExecRet);
    }

    /**
     * Regex implementation on top of the PCRE library.
     *
     * The object owns the compiled pattern (mRe) and the optional study data
     * (mExtra) that compile() stores; both are released by the destructor.
     */
    class PcreRegex : public Regex
    {
    public:
        /** Store the pattern text; nothing is compiled until compile() is called. */
        explicit PcreRegex(std::string pattern)
            : mPattern(std::move(pattern))
        {}

        // The raw PCRE handles are freed in the destructor, so a copy would free them twice.
        PcreRegex(const PcreRegex&) = delete;
        PcreRegex& operator=(const PcreRegex&) = delete;

        ~PcreRegex() override
        {
            if (mExtra) {
#ifdef PCRE_CONFIG_JIT
                // Study data is created with PCRE_STUDY_JIT_COMPILE in compile(); such data
                // has to be released with pcre_free_study() so the JIT code is freed as well.
                pcre_free_study(mExtra);
#else
                pcre_free(mExtra);
#endif
                mExtra = nullptr;
            }
            if (mRe) {
                pcre_free(mRe);
                mRe = nullptr;
            }
        }

        /** Compile the stored pattern. Returns an empty string on success, an error message otherwise. */
        std::string compile();

        /**
         * Search str and invoke the callback for each match found.
         * Returns an empty string on success, an error message otherwise.
         */
        std::string match(const std::string& str, const MatchFn& match) const override;

    private:
        std::string mPattern;   // pattern text handed to pcre_compile()
        pcre* mRe{};            // compiled pattern, null until compile() succeeds
        pcre_extra* mExtra{};   // study data from pcre_study(), may stay null
    };

    std::string PcreRegex::compile()
    {
        if (mRe)
            return "pcre_compile failed: regular expression has already been compiled";

        const char *pcreCompileErrorStr = nullptr;
        int erroffset = 0;
        pcre * const re = pcre_compile(mPattern.c_str(),0,&pcreCompileErrorStr,&erroffset,nullptr);
        if (!re) {
            if (pcreCompileErrorStr)
                return "pcre_compile failed: " + std::string(pcreCompileErrorStr);
            return "pcre_compile failed: unknown error";
        }

        // Optimize the regex, but only if PCRE_CONFIG_JIT is available
#ifdef PCRE_CONFIG_JIT
        const char *pcreStudyErrorStr = nullptr;
        pcre_extra * const pcreExtra = pcre_study(re, PCRE_STUDY_JIT_COMPILE, &pcreStudyErrorStr);
        // pcre_study() returns NULL for both errors and when it can not optimize the regex.
        // The last argument is how one checks for errors.
        // It is NULL if everything works, and points to an error string otherwise.
        if (pcreStudyErrorStr) {
            // pcre_compile() worked, but pcre_study() returned an error. Free the resources allocated by pcre_compile().
            pcre_free(re);
            return "pcre_study failed: " + std::string(pcreStudyErrorStr);
        }
        mExtra = pcreExtra;
#endif

        mRe = re;

        return "";
    }

    /**
     * Run the compiled pattern repeatedly over str and report each match.
     *
     * @param str   subject string to search
     * @param match callback invoked with the start and end offset of every match
     * @return an empty string when the search finished (including "no match"),
     *         otherwise a message describing the pcre_exec() failure
     */
    std::string PcreRegex::match(const std::string& str, const MatchFn& match) const
    {
        if (!mRe)
            return "pcre_exec failed: regular expression has not been compiled yet";

        constexpr int ovectorSize = 30;
        int ovector[ovectorSize] = {0};
        const int length = static_cast<int>(str.size());

        int pos = 0;
        while (pos < length) {
            const int pcreExecRet = pcre_exec(mRe, mExtra, str.c_str(), length, pos, 0, ovector, ovectorSize);
            if (pcreExecRet == PCRE_ERROR_NOMATCH)
                return "";
            if (pcreExecRet < 0) {
                return "pcre_exec failed (pos: " + std::to_string(pos) + "): " + pcreErrorCodeToString(pcreExecRet);
            }
            // The first pair of the output vector holds the offsets of the whole match
            const auto pos1 = static_cast<unsigned int>(ovector[0]);
            const auto pos2 = static_cast<unsigned int>(ovector[1]);

            match(pos1, pos2);

            // Continue after the end of this match. An empty match consumes no input, so
            // restarting at its end would find the very same match again and never finish;
            // step one character past it instead.
            int next = ovector[1];
            if (ovector[1] <= ovector[0])
                ++next;
            // Whatever offsets were reported, always move forward so the loop terminates.
            if (next <= pos)
                next = pos + 1;
            pos = next;
        }

        return "";
    }
}

std::shared_ptr<Regex> Regex::create(std::string pattern, std::string& err)
{
    auto* regex = new PcreRegex(std::move(pattern));
    err = regex->compile();
    if (!err.empty()) {
        delete regex;
        return nullptr;
    }
    return std::shared_ptr<Regex>(regex);
}

#endif // HAVE_RULES
